Supervised Learning-II#
Linear Regression#
Task: Regression
Simple Linear Regression
Input has only one variable
Model: y=ax+b
Closest line to the data points.
The squares of the vertical distances (residuals) are added together, and this sum is minimized.
Multiple Linear Regression
Input has more than one variable
Model: If there are two variables \(x_1\) and \(x_2\)
\(y=a_1x_1+a_2x_2+b\)
Closest plane (a hyperplane when there are more than two variables) to the data points.
import numpy as np
import matplotlib.pyplot as plt
# Synthetic data: 20 evenly spaced points on the line y = 2x + 3 plus noise.
x = np.linspace(0, 1, 20)
noise = np.random.randn(20) / 3
y = 2 * x + 3 + noise

# Keep five of the points as a small training set; one shared index list
# guarantees that x_train and y_train stay aligned.
train_idx = [1, 4, 6, 14, 18]
x_train = x[train_idx]
y_train = y[train_idx]

from sklearn.linear_model import LinearRegression

# Fit a line to the training points and evaluate it at those same points.
# reshape(-1, 1) turns the 1-D input into the single-column 2-D array that
# fit/predict are given here.
lin_reg = LinearRegression()
lin_reg.fit(x_train.reshape(-1, 1), y_train)
y_l = lin_reg.predict(x_train.reshape(-1, 1))

# Draw each residual as a dashed vertical segment between the observed
# value and the fitted value at the same x.
for x_i, y_i, y_fit_i in zip(x_train, y_train, y_l):
    plt.plot([x_i, x_i], [y_i, y_fit_i], 'r--')
plt.scatter(x_train, y_train, label='training', c='b')
plt.plot(x_train, y_l, label='linear model', c='orange')
plt.title('Linear Model and Residuals', fontsize=20)
plt.legend();
Simple Linear Regression#
# Use the California housing data
# Use MedInc to predict the house value
from sklearn.datasets import fetch_california_housing
dataset = fetch_california_housing()
dataset.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
# feature names
dataset.feature_names
['MedInc',
'HouseAge',
'AveRooms',
'AveBedrms',
'Population',
'AveOccup',
'Latitude',
'Longitude']
# X,y
X = dataset.data[:,0]
y = dataset.target
# X,y shapes
X.shape, y.shape
((20640,), (20640,))
# scatter plot of MedInc vs the target (MedHouseVal)
import matplotlib.pyplot as plt
plt.scatter(X, y)
plt.xlabel('MedInc')
# The target of the California housing data is named MedHouseVal; the
# previous 'MEDV' label appears to be a leftover from the Boston data set.
plt.ylabel('MedHouseVal')
plt.title('California Housing Price');
# train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# plot training and test set
plt.figure(figsize=(10, 10))
plt.scatter(X_train, y_train, label='Training Set', c='blue')
plt.scatter(X_test, y_test, label='Test Set', c='r')
plt.xlabel('MedInc')
# The y axis shows the California housing target (MedHouseVal); the former
# label 'MDEV' was a misspelling of a name that belongs to another data set.
plt.ylabel('MedHouseVal')
plt.title('California Housing Price')
plt.legend();
# Interactive plotly version of the training/test scatter plot
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One marker trace per subset: blue for training, red for test.
train_points = go.Scatter(x=X_train, y=y_train, mode='markers',
                          marker={'color': 'blue'}, name="training")
test_points = go.Scatter(x=X_test, y=y_test, mode='markers',
                         marker={'color': 'red'}, name="test")

fig = make_subplots()
for trace in (train_points, test_points):
    fig.add_trace(trace)
fig.update_layout(title_text="Training and Test Sets for California Housing Data")
# fit model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train.reshape(-1,1),y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# intercept
b = lin_reg.intercept_
b
0.44879836696807707
# coefficient
m = lin_reg.coef_
m
array([0.41731856])
# line x and y
import numpy as np
# Evaluate the fitted line over the observed range of the feature instead
# of a fixed 0-40 interval, so the line is drawn only where there is data.
x_lin = np.linspace(X.min(), X.max(), 100)
# m is the one-element coefficient array and b the intercept of the fit.
y_lin = m * x_lin + b
# plot training set, test set and the fitted line with plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The two subsets are drawn as markers; the model line uses the default mode.
model_traces = [
    go.Scatter(x=X_train, y=y_train, mode='markers', marker={'color': 'blue'}, name="training"),
    go.Scatter(x=X_test, y=y_test, mode='markers', marker={'color': 'red'}, name="test"),
    go.Scatter(x=x_lin, y=y_lin, marker={'color': 'green'}, name="Linear Model"),
]
fig = make_subplots()
for trace in model_traces:
    fig.add_trace(trace)
fig.update_layout(title_text="Linear Model")
# training score
lin_reg.score(X_train.reshape(-1,1), y_train)
0.4738509942209922
# test score
lin_reg.score(X_test.reshape(-1,1), y_test)
0.4725720683367075
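# Restrict the data to y < 40 (y_r).
# Note: for this data set the filter keeps every sample (the shape below is unchanged), so the restricted fit reproduces the fit above.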
y_r = y[y < 40]
y_r.shape
(20640,)
# X_r
X_r = X[y < 40]
X_r.shape
(20640,)
# train test split
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_r, y_r, test_size=0.33, random_state=42)
# fit the model
lin_reg = LinearRegression()
lin_reg.fit(Xr_train.reshape(-1,1),yr_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# coefficient
m = lin_reg.coef_
m
array([0.41731856])
# intercept
b = lin_reg.intercept_
b
0.44879836696807707
# line x and y
# Evaluate the fitted line over the observed range of the restricted
# feature instead of a fixed 0-40 interval.
xr_lin = np.linspace(X_r.min(), X_r.max(), 100)
# m is the one-element coefficient array and b the intercept of the fit.
yr_lin = m * xr_lin + b
# plotly figure: restricted training set, restricted test set, linear model
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()
# Add the restricted subsets as markers first, then the fitted line on top.
restricted_traces = (
    go.Scatter(x=Xr_train, y=yr_train, mode='markers', marker={'color': 'blue'}, name="training"),
    go.Scatter(x=Xr_test, y=yr_test, mode='markers', marker={'color': 'red'}, name="test"),
    go.Scatter(x=xr_lin, y=yr_lin, marker={'color': 'green'}, name="Linear Model"),
)
for restricted_trace in restricted_traces:
    fig.add_trace(restricted_trace)
fig.update_layout(title_text="Linear Model")
# training score
lin_reg.score(Xr_train.reshape(-1,1), yr_train)
0.4738509942209922
# test score
lin_reg.score(Xr_test.reshape(-1,1), yr_test)
0.4725720683367075
Multiple Linear Regression#
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# fit the model
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# intercept
b = lin_reg.intercept_
b
-37.08201093908004
# coefficients
m = lin_reg.coef_
m
array([ 4.44870466e-01, 9.55004561e-03, -1.21991503e-01, 7.79144696e-01,
-7.68990809e-08, -3.29948505e-03, -4.19131153e-01, -4.34103468e-01])
# coefficient shape
m.shape
(8,)
# training score
lin_reg.score(X_train , y_train )
0.609370412027382
# test score
lin_reg.score(X_test , y_test )
0.597049412878397
# actual vs predicted scatter plot
# Predict once and reuse the result for both the scatter and the diagonal.
y_pred = lin_reg.predict(X_test)
plt.scatter(y_test, y_pred)
# Reference diagonal (predicted == actual), drawn over the range the values
# actually cover rather than a fixed 0-10 interval.
lo = min(y_test.min(), y_pred.min())
hi = max(y_test.max(), y_pred.max())
plt.plot([lo, hi], [lo, hi], 'r--')
plt.title('Comparison of Actual and Predicted Values')
plt.xlabel('actual')
plt.ylabel('predicted');
# coefficients in a bar graph
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, lin_reg.coef_);
# Indices that order the coefficients from smallest to largest.
# Named sorted_idx so that the builtin sorted() is not shadowed.
sorted_idx = np.argsort(lin_reg.coef_)
sorted_idx
array([7, 6, 2, 5, 4, 1, 0, 3])
dataset.feature_names
['MedInc',
'HouseAge',
'AveRooms',
'AveBedrms',
'Population',
'AveOccup',
'Latitude',
'Longitude']
np.array(dataset.feature_names)[sorted_idx]
array(['Longitude', 'Latitude', 'AveRooms', 'AveOccup', 'Population',
'HouseAge', 'MedInc', 'AveBedrms'], dtype='<U10')
lin_reg.coef_[sorted_idx]
array([-4.34103468e-01, -4.19131153e-01, -1.21991503e-01, -3.29948505e-03,
-7.68990809e-08, 9.55004561e-03, 4.44870466e-01, 7.79144696e-01])
# sorted coefficients in a bar graph
plt.figure(figsize=(10,5))
plt.bar(np.array(dataset.feature_names)[sorted_idx], lin_reg.coef_[sorted_idx]);
Decision Tree#
consists of a hierarchy of if/else questions
predict the value of a target variable by answering these if/else questions
find the smallest tree that fits the data.
for regression tasks, each question asks whether a feature value is less than a given number or not.
Gini Impurity#
Start with the question that minimizes the Gini impurity
\(G = 1 - \sum_i p_i^2\), where \(p_i\) is the fraction of samples that belong to class \(i\).
Decision Tree Classifier#
# instantiate the class into an object
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# help
help(dtc)
Help on DecisionTreeClassifier in module sklearn.tree._classes object:
class DecisionTreeClassifier(sklearn.base.ClassifierMixin, BaseDecisionTree)
| DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
|
| A decision tree classifier.
|
| Read more in the :ref:`User Guide <tree>`.
|
| Parameters
| ----------
| criterion : {"gini", "entropy", "log_loss"}, default="gini"
| The function to measure the quality of a split. Supported criteria are
| "gini" for the Gini impurity and "log_loss" and "entropy" both for the
| Shannon information gain, see :ref:`tree_mathematical_formulation`.
|
| splitter : {"best", "random"}, default="best"
| The strategy used to choose the split at each node. Supported
| strategies are "best" to choose the best split and "random" to choose
| the best random split.
|
| max_depth : int, default=None
| The maximum depth of the tree. If None, then nodes are expanded until
| all leaves are pure or until all leaves contain less than
| min_samples_split samples.
|
| min_samples_split : int or float, default=2
| The minimum number of samples required to split an internal node:
|
| - If int, then consider `min_samples_split` as the minimum number.
| - If float, then `min_samples_split` is a fraction and
| `ceil(min_samples_split * n_samples)` are the minimum
| number of samples for each split.
|
| .. versionchanged:: 0.18
| Added float values for fractions.
|
| min_samples_leaf : int or float, default=1
| The minimum number of samples required to be at a leaf node.
| A split point at any depth will only be considered if it leaves at
| least ``min_samples_leaf`` training samples in each of the left and
| right branches. This may have the effect of smoothing the model,
| especially in regression.
|
| - If int, then consider `min_samples_leaf` as the minimum number.
| - If float, then `min_samples_leaf` is a fraction and
| `ceil(min_samples_leaf * n_samples)` are the minimum
| number of samples for each node.
|
| .. versionchanged:: 0.18
| Added float values for fractions.
|
| min_weight_fraction_leaf : float, default=0.0
| The minimum weighted fraction of the sum total of weights (of all
| the input samples) required to be at a leaf node. Samples have
| equal weight when sample_weight is not provided.
|
| max_features : int, float or {"auto", "sqrt", "log2"}, default=None
| The number of features to consider when looking for the best split:
|
| - If int, then consider `max_features` features at each split.
| - If float, then `max_features` is a fraction and
| `max(1, int(max_features * n_features_in_))` features are considered at
| each split.
| - If "auto", then `max_features=sqrt(n_features)`.
| - If "sqrt", then `max_features=sqrt(n_features)`.
| - If "log2", then `max_features=log2(n_features)`.
| - If None, then `max_features=n_features`.
|
| .. deprecated:: 1.1
| The `"auto"` option was deprecated in 1.1 and will be removed
| in 1.3.
|
| Note: the search for a split does not stop until at least one
| valid partition of the node samples is found, even if it requires to
| effectively inspect more than ``max_features`` features.
|
| random_state : int, RandomState instance or None, default=None
| Controls the randomness of the estimator. The features are always
| randomly permuted at each split, even if ``splitter`` is set to
| ``"best"``. When ``max_features < n_features``, the algorithm will
| select ``max_features`` at random at each split before finding the best
| split among them. But the best found split may vary across different
| runs, even if ``max_features=n_features``. That is the case, if the
| improvement of the criterion is identical for several splits and one
| split has to be selected at random. To obtain a deterministic behaviour
| during fitting, ``random_state`` has to be fixed to an integer.
| See :term:`Glossary <random_state>` for details.
|
| max_leaf_nodes : int, default=None
| Grow a tree with ``max_leaf_nodes`` in best-first fashion.
| Best nodes are defined as relative reduction in impurity.
| If None then unlimited number of leaf nodes.
|
| min_impurity_decrease : float, default=0.0
| A node will be split if this split induces a decrease of the impurity
| greater than or equal to this value.
|
| The weighted impurity decrease equation is the following::
|
| N_t / N * (impurity - N_t_R / N_t * right_impurity
| - N_t_L / N_t * left_impurity)
|
| where ``N`` is the total number of samples, ``N_t`` is the number of
| samples at the current node, ``N_t_L`` is the number of samples in the
| left child, and ``N_t_R`` is the number of samples in the right child.
|
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
| if ``sample_weight`` is passed.
|
| .. versionadded:: 0.19
|
| class_weight : dict, list of dict or "balanced", default=None
| Weights associated with classes in the form ``{class_label: weight}``.
| If None, all classes are supposed to have weight one. For
| multi-output problems, a list of dicts can be provided in the same
| order as the columns of y.
|
| Note that for multioutput (including multilabel) weights should be
| defined for each class of every column in its own dict. For example,
| for four-class multilabel classification weights should be
| [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
| [{1:1}, {2:5}, {3:1}, {4:1}].
|
| The "balanced" mode uses the values of y to automatically adjust
| weights inversely proportional to class frequencies in the input data
| as ``n_samples / (n_classes * np.bincount(y))``
|
| For multi-output, the weights of each column of y will be multiplied.
|
| Note that these weights will be multiplied with sample_weight (passed
| through the fit method) if sample_weight is specified.
|
| ccp_alpha : non-negative float, default=0.0
| Complexity parameter used for Minimal Cost-Complexity Pruning. The
| subtree with the largest cost complexity that is smaller than
| ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
| :ref:`minimal_cost_complexity_pruning` for details.
|
| .. versionadded:: 0.22
|
| Attributes
| ----------
| classes_ : ndarray of shape (n_classes,) or list of ndarray
| The classes labels (single output problem),
| or a list of arrays of class labels (multi-output problem).
|
| feature_importances_ : ndarray of shape (n_features,)
| The impurity-based feature importances.
| The higher, the more important the feature.
| The importance of a feature is computed as the (normalized)
| total reduction of the criterion brought by that feature. It is also
| known as the Gini importance [4]_.
|
| Warning: impurity-based feature importances can be misleading for
| high cardinality features (many unique values). See
| :func:`sklearn.inspection.permutation_importance` as an alternative.
|
| max_features_ : int
| The inferred value of max_features.
|
| n_classes_ : int or list of int
| The number of classes (for single output problems),
| or a list containing the number of classes for each
| output (for multi-output problems).
|
| n_features_in_ : int
| Number of features seen during :term:`fit`.
|
| .. versionadded:: 0.24
|
| feature_names_in_ : ndarray of shape (`n_features_in_`,)
| Names of features seen during :term:`fit`. Defined only when `X`
| has feature names that are all strings.
|
| .. versionadded:: 1.0
|
| n_outputs_ : int
| The number of outputs when ``fit`` is performed.
|
| tree_ : Tree instance
| The underlying Tree object. Please refer to
| ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
| :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
| for basic usage of these attributes.
|
| See Also
| --------
| DecisionTreeRegressor : A decision tree regressor.
|
| Notes
| -----
| The default values for the parameters controlling the size of the trees
| (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
| unpruned trees which can potentially be very large on some data sets. To
| reduce memory consumption, the complexity and size of the trees should be
| controlled by setting those parameter values.
|
| The :meth:`predict` method operates using the :func:`numpy.argmax`
| function on the outputs of :meth:`predict_proba`. This means that in
| case the highest predicted probabilities are tied, the classifier will
| predict the tied class with the lowest index in :term:`classes_`.
|
| References
| ----------
|
| .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
|
| .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
| and Regression Trees", Wadsworth, Belmont, CA, 1984.
|
| .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
| Learning", Springer, 2009.
|
| .. [4] L. Breiman, and A. Cutler, "Random Forests",
| https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
|
| Examples
| --------
| >>> from sklearn.datasets import load_iris
| >>> from sklearn.model_selection import cross_val_score
| >>> from sklearn.tree import DecisionTreeClassifier
| >>> clf = DecisionTreeClassifier(random_state=0)
| >>> iris = load_iris()
| >>> cross_val_score(clf, iris.data, iris.target, cv=10)
| ... # doctest: +SKIP
| ...
| array([ 1. , 0.93..., 0.86..., 0.93..., 0.93...,
| 0.93..., 0.93..., 1. , 0.93..., 1. ])
|
| Method resolution order:
| DecisionTreeClassifier
| sklearn.base.ClassifierMixin
| BaseDecisionTree
| sklearn.base.MultiOutputMixin
| sklearn.base.BaseEstimator
| builtins.object
|
| Methods defined here:
|
| __init__(self, *, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
| Initialize self. See help(type(self)) for accurate signature.
|
| fit(self, X, y, sample_weight=None, check_input=True)
| Build a decision tree classifier from the training set (X, y).
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The training input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csc_matrix``.
|
| y : array-like of shape (n_samples,) or (n_samples, n_outputs)
| The target values (class labels) as integers or strings.
|
| sample_weight : array-like of shape (n_samples,), default=None
| Sample weights. If None, then samples are equally weighted. Splits
| that would create child nodes with net zero or negative weight are
| ignored while searching for a split in each node. Splits are also
| ignored if they would result in any single class carrying a
| negative weight in either child node.
|
| check_input : bool, default=True
| Allow to bypass several input checking.
| Don't use this parameter unless you know what you're doing.
|
| Returns
| -------
| self : DecisionTreeClassifier
| Fitted estimator.
|
| predict_log_proba(self, X)
| Predict class log-probabilities of the input samples X.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csr_matrix``.
|
| Returns
| -------
| proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1
| The class log-probabilities of the input samples. The order of the
| classes corresponds to that in the attribute :term:`classes_`.
|
| predict_proba(self, X, check_input=True)
| Predict class probabilities of the input samples X.
|
| The predicted class probability is the fraction of samples of the same
| class in a leaf.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csr_matrix``.
|
| check_input : bool, default=True
| Allow to bypass several input checking.
| Don't use this parameter unless you know what you're doing.
|
| Returns
| -------
| proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1
| The class probabilities of the input samples. The order of the
| classes corresponds to that in the attribute :term:`classes_`.
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| __abstractmethods__ = frozenset()
|
| __annotations__ = {'_parameter_constraints': <class 'dict'>}
|
| ----------------------------------------------------------------------
| Methods inherited from sklearn.base.ClassifierMixin:
|
| score(self, X, y, sample_weight=None)
| Return the mean accuracy on the given test data and labels.
|
| In multi-label classification, this is the subset accuracy
| which is a harsh metric since you require for each sample that
| each label set be correctly predicted.
|
| Parameters
| ----------
| X : array-like of shape (n_samples, n_features)
| Test samples.
|
| y : array-like of shape (n_samples,) or (n_samples, n_outputs)
| True labels for `X`.
|
| sample_weight : array-like of shape (n_samples,), default=None
| Sample weights.
|
| Returns
| -------
| score : float
| Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from sklearn.base.ClassifierMixin:
|
| __dict__
| dictionary for instance variables
|
| __weakref__
| list of weak references to the object
|
| ----------------------------------------------------------------------
| Methods inherited from BaseDecisionTree:
|
| apply(self, X, check_input=True)
| Return the index of the leaf that each sample is predicted as.
|
| .. versionadded:: 0.17
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csr_matrix``.
|
| check_input : bool, default=True
| Allow to bypass several input checking.
| Don't use this parameter unless you know what you're doing.
|
| Returns
| -------
| X_leaves : array-like of shape (n_samples,)
| For each datapoint x in X, return the index of the leaf x
| ends up in. Leaves are numbered within
| ``[0; self.tree_.node_count)``, possibly with gaps in the
| numbering.
|
| cost_complexity_pruning_path(self, X, y, sample_weight=None)
| Compute the pruning path during Minimal Cost-Complexity Pruning.
|
| See :ref:`minimal_cost_complexity_pruning` for details on the pruning
| process.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The training input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csc_matrix``.
|
| y : array-like of shape (n_samples,) or (n_samples, n_outputs)
| The target values (class labels) as integers or strings.
|
| sample_weight : array-like of shape (n_samples,), default=None
| Sample weights. If None, then samples are equally weighted. Splits
| that would create child nodes with net zero or negative weight are
| ignored while searching for a split in each node. Splits are also
| ignored if they would result in any single class carrying a
| negative weight in either child node.
|
| Returns
| -------
| ccp_path : :class:`~sklearn.utils.Bunch`
| Dictionary-like object, with the following attributes.
|
| ccp_alphas : ndarray
| Effective alphas of subtree during pruning.
|
| impurities : ndarray
| Sum of the impurities of the subtree leaves for the
| corresponding alpha value in ``ccp_alphas``.
|
| decision_path(self, X, check_input=True)
| Return the decision path in the tree.
|
| .. versionadded:: 0.18
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csr_matrix``.
|
| check_input : bool, default=True
| Allow to bypass several input checking.
| Don't use this parameter unless you know what you're doing.
|
| Returns
| -------
| indicator : sparse matrix of shape (n_samples, n_nodes)
| Return a node indicator CSR matrix where non zero elements
| indicates that the samples goes through the nodes.
|
| get_depth(self)
| Return the depth of the decision tree.
|
| The depth of a tree is the maximum distance between the root
| and any leaf.
|
| Returns
| -------
| self.tree_.max_depth : int
| The maximum depth of the tree.
|
| get_n_leaves(self)
| Return the number of leaves of the decision tree.
|
| Returns
| -------
| self.tree_.n_leaves : int
| Number of leaves.
|
| predict(self, X, check_input=True)
| Predict class or regression value for X.
|
| For a classification model, the predicted class for each sample in X is
| returned. For a regression model, the predicted value based on X is
| returned.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, it will be converted to
| ``dtype=np.float32`` and if a sparse matrix is provided
| to a sparse ``csr_matrix``.
|
| check_input : bool, default=True
| Allow to bypass several input checking.
| Don't use this parameter unless you know what you're doing.
|
| Returns
| -------
| y : array-like of shape (n_samples,) or (n_samples, n_outputs)
| The predicted classes, or the predict values.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from BaseDecisionTree:
|
| feature_importances_
| Return the feature importances.
|
| The importance of a feature is computed as the (normalized) total
| reduction of the criterion brought by that feature.
| It is also known as the Gini importance.
|
| Warning: impurity-based feature importances can be misleading for
| high cardinality features (many unique values). See
| :func:`sklearn.inspection.permutation_importance` as an alternative.
|
| Returns
| -------
| feature_importances_ : ndarray of shape (n_features,)
| Normalized total reduction of criteria by feature
| (Gini importance).
|
| ----------------------------------------------------------------------
| Methods inherited from sklearn.base.BaseEstimator:
|
| __getstate__(self)
| Helper for pickle.
|
| __repr__(self, N_CHAR_MAX=700)
| Return repr(self).
|
| __setstate__(self, state)
|
| get_params(self, deep=True)
| Get parameters for this estimator.
|
| Parameters
| ----------
| deep : bool, default=True
| If True, will return the parameters for this estimator and
| contained subobjects that are estimators.
|
| Returns
| -------
| params : dict
| Parameter names mapped to their values.
|
| set_params(self, **params)
| Set the parameters of this estimator.
|
| The method works on simple estimators as well as on nested objects
| (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
| parameters of the form ``<component>__<parameter>`` so that it's
| possible to update each component of a nested object.
|
| Parameters
| ----------
| **params : dict
| Estimator parameters.
|
| Returns
| -------
| self : estimator instance
| Estimator instance.
# load the breast cancer data, split it and fit the model
from sklearn.datasets import load_breast_cancer
dataset_bc = load_breast_cancer()
# Take features and labels from the Bunch loaded above instead of calling
# load_breast_cancer() two more times.
X_bc = dataset_bc.data
y_bc = dataset_bc.target
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size=0.33, random_state=42)
dtc.fit(X_train_bc, y_train_bc)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
# train test scores
dtc.score(X_train_bc, y_train_bc), dtc.score(X_test_bc, y_test_bc)
(1.0, 0.8936170212765957)
# sketch the tree
from sklearn import tree
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')
tree.plot_tree(dtc, filled=True, class_names=dataset_bc.target_names, feature_names=dataset_bc.feature_names, ax=ax);
# feature importances in a bar graph
plt.figure(figsize=(10,5))
plt.bar(dataset_bc.feature_names, dtc.feature_importances_)
plt.xticks(rotation=90);
Important Hyperparameter
max_depth : int, default=None
The maximum depth of the tree.
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
# max_depth=2
dtc2 = DecisionTreeClassifier(max_depth=2)
dtc2.fit(X_train_bc, y_train_bc)
DecisionTreeClassifier(max_depth=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(max_depth=2)
# train test scores
dtc2.score(X_train_bc, y_train_bc), dtc2.score(X_test_bc, y_test_bc)
(0.9448818897637795, 0.925531914893617)
from sklearn import tree
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')
tree.plot_tree(dtc2, filled=True, class_names=dataset_bc.target_names, feature_names=dataset_bc.feature_names, ax=ax);
# feature importances
dtc2.feature_importances_
array([0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.88963542, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.0410358 , 0. , 0.06932878, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ])
# feature importances in a bar graph
plt.figure(figsize=(10,5))
plt.bar(dataset_bc.feature_names, dtc2.feature_importances_)
plt.xticks(rotation=90);
Decision Tree Regressor#
# instantiate the class into an object
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
# fit the model
dtr.fit(X_train , y_train )
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
# scores
dtr.score(X_train , y_train ), dtr.score(X_test , y_test )
(1.0, 0.5931229184969595)
# sketch the tree
# dtr was grown without a depth limit, and drawing every node of it did not
# finish (the rendering had to be interrupted), so only the top levels are
# drawn here; deeper nodes are truncated by plot_tree's max_depth argument.
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')
tree.plot_tree(dtr, max_depth=3, filled=True, feature_names=dataset.feature_names, ax=ax);
Error in callback <function _draw_all_if_interactive at 0x1213a3d80> (for post_execute), with arguments args (),kwargs {}:
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/pyplot.py:197, in _draw_all_if_interactive()
195 def _draw_all_if_interactive() -> None:
196 if matplotlib.is_interactive():
--> 197 draw_all()
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/_pylab_helpers.py:132, in Gcf.draw_all(cls, force)
130 for manager in cls.get_all_fig_managers():
131 if force or manager.canvas.figure.stale:
--> 132 manager.canvas.draw_idle()
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/backend_bases.py:1893, in FigureCanvasBase.draw_idle(self, *args, **kwargs)
1891 if not self._is_idle_drawing:
1892 with self._idle_draw_cntx():
-> 1893 self.draw(*args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/backends/backend_agg.py:388, in FigureCanvasAgg.draw(self)
385 # Acquire a lock on the shared font cache.
386 with (self.toolbar._wait_cursor_for_draw_cm() if self.toolbar
387 else nullcontext()):
--> 388 self.figure.draw(self.renderer)
389 # A GUI class may be need to update a window using this draw, so
390 # don't forget to call the superclass.
391 super().draw()
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:95, in _finalize_rasterization.<locals>.draw_wrapper(artist, renderer, *args, **kwargs)
93 @wraps(draw)
94 def draw_wrapper(artist, renderer, *args, **kwargs):
---> 95 result = draw(artist, renderer, *args, **kwargs)
96 if renderer._rasterizing:
97 renderer.stop_rasterizing()
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
69 if artist.get_agg_filter() is not None:
70 renderer.start_filter()
---> 72 return draw(artist, renderer)
73 finally:
74 if artist.get_agg_filter() is not None:
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/figure.py:3154, in Figure.draw(self, renderer)
3151 # ValueError can occur when resizing a window.
3153 self.patch.draw(renderer)
-> 3154 mimage._draw_list_compositing_images(
3155 renderer, self, artists, self.suppressComposite)
3157 for sfig in self.subfigs:
3158 sfig.draw(renderer)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/image.py:132, in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
130 if not_composite or not has_images:
131 for a in artists:
--> 132 a.draw(renderer)
133 else:
134 # Composite any adjacent images together
135 image_group = []
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
69 if artist.get_agg_filter() is not None:
70 renderer.start_filter()
---> 72 return draw(artist, renderer)
73 finally:
74 if artist.get_agg_filter() is not None:
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/axes/_base.py:3070, in _AxesBase.draw(self, renderer)
3067 if artists_rasterized:
3068 _draw_rasterized(self.figure, artists_rasterized, renderer)
-> 3070 mimage._draw_list_compositing_images(
3071 renderer, self, artists, self.figure.suppressComposite)
3073 renderer.close_group('axes')
3074 self.stale = False
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/image.py:132, in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
130 if not_composite or not has_images:
131 for a in artists:
--> 132 a.draw(renderer)
133 else:
134 # Composite any adjacent images together
135 image_group = []
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
69 if artist.get_agg_filter() is not None:
70 renderer.start_filter()
---> 72 return draw(artist, renderer)
73 finally:
74 if artist.get_agg_filter() is not None:
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/text.py:1988, in Annotation.draw(self, renderer)
1986 if self.arrow_patch.figure is None and self.figure is not None:
1987 self.arrow_patch.figure = self.figure
-> 1988 self.arrow_patch.draw(renderer)
1989 # Draw text, including FancyBboxPatch, after FancyArrowPatch.
1990 # Otherwise, a wedge arrowstyle can land partly on top of the Bbox.
1991 Text.draw(self, renderer)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:39, in _prevent_rasterization.<locals>.draw_wrapper(artist, renderer, *args, **kwargs)
36 renderer.stop_rasterizing()
37 renderer._rasterizing = False
---> 39 return draw(artist, renderer, *args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:4380, in FancyArrowPatch.draw(self, renderer)
4376 # FIXME: dpi_cor is for the dpi-dependency of the linewidth. There
4377 # could be room for improvement. Maybe _get_path_in_displaycoord could
4378 # take a renderer argument, but get_path should be adapted too.
4379 self._dpi_cor = renderer.points_to_pixels(1.)
-> 4380 path, fillable = self._get_path_in_displaycoord()
4382 if not np.iterable(fillable):
4383 path = [path]
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:4355, in FancyArrowPatch._get_path_in_displaycoord(self)
4353 posB = self._convert_xy_units(self._posA_posB[1])
4354 (posA, posB) = self.get_transform().transform((posA, posB))
-> 4355 _path = self.get_connectionstyle()(posA, posB,
4356 patchA=self.patchA,
4357 patchB=self.patchB,
4358 shrinkA=self.shrinkA * dpi_cor,
4359 shrinkB=self.shrinkB * dpi_cor
4360 )
4361 else:
4362 _path = self.get_transform().transform_path(self._path_original)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:2751, in ConnectionStyle._Base.__call__(self, posA, posB, shrinkA, shrinkB, patchA, patchB)
2746 """
2747 Call the *connect* method to create a path between *posA* and
2748 *posB*; then clip and shrink the path.
2749 """
2750 path = self.connect(posA, posB)
-> 2751 path = self._clip(
2752 path,
2753 self._in_patch(patchA) if patchA else None,
2754 self._in_patch(patchB) if patchB else None,
2755 )
2756 path = self._clip(
2757 path,
2758 inside_circle(*path.vertices[0], shrinkA) if shrinkA else None,
2759 inside_circle(*path.vertices[-1], shrinkB) if shrinkB else None
2760 )
2761 return path
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:2734, in ConnectionStyle._Base._clip(self, path, in_start, in_stop)
2732 if in_start:
2733 try:
-> 2734 _, path = split_path_inout(path, in_start)
2735 except ValueError:
2736 pass
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/bezier.py:370, in split_path_inout(path, inside, tolerance, reorder_inout)
367 raise ValueError("The path does not intersect with the patch")
369 bp = bezier_path.reshape((-1, 2))
--> 370 left, right = split_bezier_intersecting_with_closedpath(
371 bp, inside, tolerance)
372 if len(left) == 2:
373 codes_left = [Path.LINETO]
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/bezier.py:333, in split_bezier_intersecting_with_closedpath(bezier, inside_closedpath, tolerance)
330 bz = BezierSegment(bezier)
331 bezier_point_at_t = bz.point_at_t
--> 333 t0, t1 = find_bezier_t_intersecting_with_closedpath(
334 bezier_point_at_t, inside_closedpath, tolerance=tolerance)
336 _left, _right = split_de_casteljau(bezier, (t0 + t1) / 2.)
337 return _left, _right
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/bezier.py:170, in find_bezier_t_intersecting_with_closedpath(bezier_point_at_t, inside_closedpath, t0, t1, tolerance)
168 middle_t = 0.5 * (t0 + t1)
169 middle = bezier_point_at_t(middle_t)
--> 170 middle_inside = inside_closedpath(middle)
172 if start_inside ^ middle_inside:
173 t1 = middle_t
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:2721, in ConnectionStyle._Base._in_patch.<locals>.<lambda>(xy)
2716 def _in_patch(self, patch):
2717 """
2718 Return a predicate function testing whether a point *xy* is
2719 contained in *patch*.
2720 """
-> 2721 return lambda xy: patch.contains(
2722 SimpleNamespace(x=xy[0], y=xy[1]))[0]
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:149, in Patch.contains(self, mouseevent, radius)
147 else:
148 subpaths = [self.get_path()]
--> 149 inside = any(
150 subpath.contains_point(
151 (mouseevent.x, mouseevent.y), self.get_transform(), radius)
152 for subpath in subpaths)
153 return inside, {}
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/patches.py:150, in <genexpr>(.0)
147 else:
148 subpaths = [self.get_path()]
149 inside = any(
--> 150 subpath.contains_point(
151 (mouseevent.x, mouseevent.y), self.get_transform(), radius)
152 for subpath in subpaths)
153 return inside, {}
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/path.py:542, in Path.contains_point(self, point, transform, radius)
503 """
504 Return whether the area enclosed by the path contains the given point.
505
(...)
539 the result is not guaranteed to be correct.
540 """
541 if transform is not None:
--> 542 transform = transform.frozen()
543 # `point_in_path` does not handle nonlinear transforms, so we
544 # transform the path ourselves. If *transform* is affine, letting
545 # `point_in_path` handle the transform avoids allocating an extra
546 # buffer.
547 if transform and not transform.is_affine:
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/transforms.py:1835, in Affine2DBase.frozen(self)
1833 def frozen(self):
1834 # docstring inherited
-> 1835 return Affine2D(self.get_matrix().copy())
KeyboardInterrupt:
# feature importances
dtr.feature_importances_
array([0.52237857, 0.06006469, 0.05320339, 0.02897168, 0.03106369,
0.13288264, 0.08741881, 0.08401653])
# feature importances in a bar graph
# (tree models expose feature_importances_, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, dtr.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
# max_depth=2
dtr2 = DecisionTreeRegressor(max_depth=2)
# fit the model
dtr2.fit(X_train , y_train )
DecisionTreeRegressor(max_depth=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(max_depth=2)
# sketch the tree (only two levels deep, so it renders quickly)
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')
tree.plot_tree(dtr2, filled=True, feature_names=dataset.feature_names, ax=ax);
# feature importances
dtr2.feature_importances_
array([1., 0., 0., 0., 0., 0., 0., 0.])
# feature importances in a bar graph
# (tree models expose feature_importances_, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, dtr2.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
Random Forest#
ensemble of decision trees for classification and regression
usually trained with the “bagging” method
each tree is trained on a random sample of the training set (drawn with replacement)
builds multiple decision trees and merges them together to get a more accurate and stable prediction
adds additional randomness to the model while growing the trees (each split considers only a random subset of the features)
Final Decision:
Soft voting for classification
Average for regression
Random Forest Classifier#
# instantiate the class into an object
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
# help
help(rfc)
Help on RandomForestClassifier in module sklearn.ensemble._forest object:
class RandomForestClassifier(ForestClassifier)
| RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
|
| A random forest classifier.
|
| A random forest is a meta estimator that fits a number of decision tree
| classifiers on various sub-samples of the dataset and uses averaging to
| improve the predictive accuracy and control over-fitting.
| The sub-sample size is controlled with the `max_samples` parameter if
| `bootstrap=True` (default), otherwise the whole dataset is used to build
| each tree.
|
| Read more in the :ref:`User Guide <forest>`.
|
| Parameters
| ----------
| n_estimators : int, default=100
| The number of trees in the forest.
|
| .. versionchanged:: 0.22
| The default value of ``n_estimators`` changed from 10 to 100
| in 0.22.
|
| criterion : {"gini", "entropy", "log_loss"}, default="gini"
| The function to measure the quality of a split. Supported criteria are
| "gini" for the Gini impurity and "log_loss" and "entropy" both for the
| Shannon information gain, see :ref:`tree_mathematical_formulation`.
| Note: This parameter is tree-specific.
|
| max_depth : int, default=None
| The maximum depth of the tree. If None, then nodes are expanded until
| all leaves are pure or until all leaves contain less than
| min_samples_split samples.
|
| min_samples_split : int or float, default=2
| The minimum number of samples required to split an internal node:
|
| - If int, then consider `min_samples_split` as the minimum number.
| - If float, then `min_samples_split` is a fraction and
| `ceil(min_samples_split * n_samples)` are the minimum
| number of samples for each split.
|
| .. versionchanged:: 0.18
| Added float values for fractions.
|
| min_samples_leaf : int or float, default=1
| The minimum number of samples required to be at a leaf node.
| A split point at any depth will only be considered if it leaves at
| least ``min_samples_leaf`` training samples in each of the left and
| right branches. This may have the effect of smoothing the model,
| especially in regression.
|
| - If int, then consider `min_samples_leaf` as the minimum number.
| - If float, then `min_samples_leaf` is a fraction and
| `ceil(min_samples_leaf * n_samples)` are the minimum
| number of samples for each node.
|
| .. versionchanged:: 0.18
| Added float values for fractions.
|
| min_weight_fraction_leaf : float, default=0.0
| The minimum weighted fraction of the sum total of weights (of all
| the input samples) required to be at a leaf node. Samples have
| equal weight when sample_weight is not provided.
|
| max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
| The number of features to consider when looking for the best split:
|
| - If int, then consider `max_features` features at each split.
| - If float, then `max_features` is a fraction and
| `max(1, int(max_features * n_features_in_))` features are considered at each
| split.
| - If "auto", then `max_features=sqrt(n_features)`.
| - If "sqrt", then `max_features=sqrt(n_features)`.
| - If "log2", then `max_features=log2(n_features)`.
| - If None, then `max_features=n_features`.
|
| .. versionchanged:: 1.1
| The default of `max_features` changed from `"auto"` to `"sqrt"`.
|
| .. deprecated:: 1.1
| The `"auto"` option was deprecated in 1.1 and will be removed
| in 1.3.
|
| Note: the search for a split does not stop until at least one
| valid partition of the node samples is found, even if it requires to
| effectively inspect more than ``max_features`` features.
|
| max_leaf_nodes : int, default=None
| Grow trees with ``max_leaf_nodes`` in best-first fashion.
| Best nodes are defined as relative reduction in impurity.
| If None then unlimited number of leaf nodes.
|
| min_impurity_decrease : float, default=0.0
| A node will be split if this split induces a decrease of the impurity
| greater than or equal to this value.
|
| The weighted impurity decrease equation is the following::
|
| N_t / N * (impurity - N_t_R / N_t * right_impurity
| - N_t_L / N_t * left_impurity)
|
| where ``N`` is the total number of samples, ``N_t`` is the number of
| samples at the current node, ``N_t_L`` is the number of samples in the
| left child, and ``N_t_R`` is the number of samples in the right child.
|
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
| if ``sample_weight`` is passed.
|
| .. versionadded:: 0.19
|
| bootstrap : bool, default=True
| Whether bootstrap samples are used when building trees. If False, the
| whole dataset is used to build each tree.
|
| oob_score : bool, default=False
| Whether to use out-of-bag samples to estimate the generalization score.
| Only available if bootstrap=True.
|
| n_jobs : int, default=None
| The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
| :meth:`decision_path` and :meth:`apply` are all parallelized over the
| trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
| context. ``-1`` means using all processors. See :term:`Glossary
| <n_jobs>` for more details.
|
| random_state : int, RandomState instance or None, default=None
| Controls both the randomness of the bootstrapping of the samples used
| when building trees (if ``bootstrap=True``) and the sampling of the
| features to consider when looking for the best split at each node
| (if ``max_features < n_features``).
| See :term:`Glossary <random_state>` for details.
|
| verbose : int, default=0
| Controls the verbosity when fitting and predicting.
|
| warm_start : bool, default=False
| When set to ``True``, reuse the solution of the previous call to fit
| and add more estimators to the ensemble, otherwise, just fit a whole
| new forest. See :term:`Glossary <warm_start>` and
| :ref:`gradient_boosting_warm_start` for details.
|
| class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
| Weights associated with classes in the form ``{class_label: weight}``.
| If not given, all classes are supposed to have weight one. For
| multi-output problems, a list of dicts can be provided in the same
| order as the columns of y.
|
| Note that for multioutput (including multilabel) weights should be
| defined for each class of every column in its own dict. For example,
| for four-class multilabel classification weights should be
| [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
| [{1:1}, {2:5}, {3:1}, {4:1}].
|
| The "balanced" mode uses the values of y to automatically adjust
| weights inversely proportional to class frequencies in the input data
| as ``n_samples / (n_classes * np.bincount(y))``
|
| The "balanced_subsample" mode is the same as "balanced" except that
| weights are computed based on the bootstrap sample for every tree
| grown.
|
| For multi-output, the weights of each column of y will be multiplied.
|
| Note that these weights will be multiplied with sample_weight (passed
| through the fit method) if sample_weight is specified.
|
| ccp_alpha : non-negative float, default=0.0
| Complexity parameter used for Minimal Cost-Complexity Pruning. The
| subtree with the largest cost complexity that is smaller than
| ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
| :ref:`minimal_cost_complexity_pruning` for details.
|
| .. versionadded:: 0.22
|
| max_samples : int or float, default=None
| If bootstrap is True, the number of samples to draw from X
| to train each base estimator.
|
| - If None (default), then draw `X.shape[0]` samples.
| - If int, then draw `max_samples` samples.
| - If float, then draw `max_samples * X.shape[0]` samples. Thus,
| `max_samples` should be in the interval `(0.0, 1.0]`.
|
| .. versionadded:: 0.22
|
| Attributes
| ----------
| estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier`
| The child estimator template used to create the collection of fitted
| sub-estimators.
|
| .. versionadded:: 1.2
| `base_estimator_` was renamed to `estimator_`.
|
| base_estimator_ : DecisionTreeClassifier
| The child estimator template used to create the collection of fitted
| sub-estimators.
|
| .. deprecated:: 1.2
| `base_estimator_` is deprecated and will be removed in 1.4.
| Use `estimator_` instead.
|
| estimators_ : list of DecisionTreeClassifier
| The collection of fitted sub-estimators.
|
| classes_ : ndarray of shape (n_classes,) or a list of such arrays
| The classes labels (single output problem), or a list of arrays of
| class labels (multi-output problem).
|
| n_classes_ : int or list
| The number of classes (single output problem), or a list containing the
| number of classes for each output (multi-output problem).
|
| n_features_in_ : int
| Number of features seen during :term:`fit`.
|
| .. versionadded:: 0.24
|
| feature_names_in_ : ndarray of shape (`n_features_in_`,)
| Names of features seen during :term:`fit`. Defined only when `X`
| has feature names that are all strings.
|
| .. versionadded:: 1.0
|
| n_outputs_ : int
| The number of outputs when ``fit`` is performed.
|
| feature_importances_ : ndarray of shape (n_features,)
| The impurity-based feature importances.
| The higher, the more important the feature.
| The importance of a feature is computed as the (normalized)
| total reduction of the criterion brought by that feature. It is also
| known as the Gini importance.
|
| Warning: impurity-based feature importances can be misleading for
| high cardinality features (many unique values). See
| :func:`sklearn.inspection.permutation_importance` as an alternative.
|
| oob_score_ : float
| Score of the training dataset obtained using an out-of-bag estimate.
| This attribute exists only when ``oob_score`` is True.
|
| oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)
| Decision function computed with out-of-bag estimate on the training
| set. If n_estimators is small it might be possible that a data point
| was never left out during the bootstrap. In this case,
| `oob_decision_function_` might contain NaN. This attribute exists
| only when ``oob_score`` is True.
|
| See Also
| --------
| sklearn.tree.DecisionTreeClassifier : A decision tree classifier.
| sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized
| tree classifiers.
|
| Notes
| -----
| The default values for the parameters controlling the size of the trees
| (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
| unpruned trees which can potentially be very large on some data sets. To
| reduce memory consumption, the complexity and size of the trees should be
| controlled by setting those parameter values.
|
| The features are always randomly permuted at each split. Therefore,
| the best found split may vary, even with the same training data,
| ``max_features=n_features`` and ``bootstrap=False``, if the improvement
| of the criterion is identical for several splits enumerated during the
| search of the best split. To obtain a deterministic behaviour during
| fitting, ``random_state`` has to be fixed.
|
| References
| ----------
| .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
|
| Examples
| --------
| >>> from sklearn.ensemble import RandomForestClassifier
| >>> from sklearn.datasets import make_classification
| >>> X, y = make_classification(n_samples=1000, n_features=4,
| ... n_informative=2, n_redundant=0,
| ... random_state=0, shuffle=False)
| >>> clf = RandomForestClassifier(max_depth=2, random_state=0)
| >>> clf.fit(X, y)
| RandomForestClassifier(...)
| >>> print(clf.predict([[0, 0, 0, 0]]))
| [1]
|
| Method resolution order:
| RandomForestClassifier
| ForestClassifier
| sklearn.base.ClassifierMixin
| BaseForest
| sklearn.base.MultiOutputMixin
| sklearn.ensemble._base.BaseEnsemble
| sklearn.base.MetaEstimatorMixin
| sklearn.base.BaseEstimator
| builtins.object
|
| Methods defined here:
|
| __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
| Initialize self. See help(type(self)) for accurate signature.
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| __abstractmethods__ = frozenset()
|
| __annotations__ = {'_parameter_constraints': <class 'dict'>}
|
| ----------------------------------------------------------------------
| Methods inherited from ForestClassifier:
|
| predict(self, X)
| Predict class for X.
|
| The predicted class of an input sample is a vote by the trees in
| the forest, weighted by their probability estimates. That is,
| the predicted class is the one with highest mean probability
| estimate across the trees.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, its dtype will be converted to
| ``dtype=np.float32``. If a sparse matrix is provided, it will be
| converted into a sparse ``csr_matrix``.
|
| Returns
| -------
| y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
| The predicted classes.
|
| predict_log_proba(self, X)
| Predict class log-probabilities for X.
|
| The predicted class log-probabilities of an input sample is computed as
| the log of the mean predicted class probabilities of the trees in the
| forest.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, its dtype will be converted to
| ``dtype=np.float32``. If a sparse matrix is provided, it will be
| converted into a sparse ``csr_matrix``.
|
| Returns
| -------
| p : ndarray of shape (n_samples, n_classes), or a list of such arrays
| The class probabilities of the input samples. The order of the
| classes corresponds to that in the attribute :term:`classes_`.
|
| predict_proba(self, X)
| Predict class probabilities for X.
|
| The predicted class probabilities of an input sample are computed as
| the mean predicted class probabilities of the trees in the forest.
| The class probability of a single tree is the fraction of samples of
| the same class in a leaf.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, its dtype will be converted to
| ``dtype=np.float32``. If a sparse matrix is provided, it will be
| converted into a sparse ``csr_matrix``.
|
| Returns
| -------
| p : ndarray of shape (n_samples, n_classes), or a list of such arrays
| The class probabilities of the input samples. The order of the
| classes corresponds to that in the attribute :term:`classes_`.
|
| ----------------------------------------------------------------------
| Methods inherited from sklearn.base.ClassifierMixin:
|
| score(self, X, y, sample_weight=None)
| Return the mean accuracy on the given test data and labels.
|
| In multi-label classification, this is the subset accuracy
| which is a harsh metric since you require for each sample that
| each label set be correctly predicted.
|
| Parameters
| ----------
| X : array-like of shape (n_samples, n_features)
| Test samples.
|
| y : array-like of shape (n_samples,) or (n_samples, n_outputs)
| True labels for `X`.
|
| sample_weight : array-like of shape (n_samples,), default=None
| Sample weights.
|
| Returns
| -------
| score : float
| Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from sklearn.base.ClassifierMixin:
|
| __dict__
| dictionary for instance variables
|
| __weakref__
| list of weak references to the object
|
| ----------------------------------------------------------------------
| Methods inherited from BaseForest:
|
| apply(self, X)
| Apply trees in the forest to X, return leaf indices.
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, its dtype will be converted to
| ``dtype=np.float32``. If a sparse matrix is provided, it will be
| converted into a sparse ``csr_matrix``.
|
| Returns
| -------
| X_leaves : ndarray of shape (n_samples, n_estimators)
| For each datapoint x in X and for each tree in the forest,
| return the index of the leaf x ends up in.
|
| decision_path(self, X)
| Return the decision path in the forest.
|
| .. versionadded:: 0.18
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The input samples. Internally, its dtype will be converted to
| ``dtype=np.float32``. If a sparse matrix is provided, it will be
| converted into a sparse ``csr_matrix``.
|
| Returns
| -------
| indicator : sparse matrix of shape (n_samples, n_nodes)
| Return a node indicator matrix where non zero elements indicates
| that the samples goes through the nodes. The matrix is of CSR
| format.
|
| n_nodes_ptr : ndarray of shape (n_estimators + 1,)
| The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
| gives the indicator value for the i-th estimator.
|
| fit(self, X, y, sample_weight=None)
| Build a forest of trees from the training set (X, y).
|
| Parameters
| ----------
| X : {array-like, sparse matrix} of shape (n_samples, n_features)
| The training input samples. Internally, its dtype will be converted
| to ``dtype=np.float32``. If a sparse matrix is provided, it will be
| converted into a sparse ``csc_matrix``.
|
| y : array-like of shape (n_samples,) or (n_samples, n_outputs)
| The target values (class labels in classification, real numbers in
| regression).
|
| sample_weight : array-like of shape (n_samples,), default=None
| Sample weights. If None, then samples are equally weighted. Splits
| that would create child nodes with net zero or negative weight are
| ignored while searching for a split in each node. In the case of
| classification, splits are also ignored if they would result in any
| single class carrying a negative weight in either child node.
|
| Returns
| -------
| self : object
| Fitted estimator.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from BaseForest:
|
| feature_importances_
| The impurity-based feature importances.
|
| The higher, the more important the feature.
| The importance of a feature is computed as the (normalized)
| total reduction of the criterion brought by that feature. It is also
| known as the Gini importance.
|
| Warning: impurity-based feature importances can be misleading for
| high cardinality features (many unique values). See
| :func:`sklearn.inspection.permutation_importance` as an alternative.
|
| Returns
| -------
| feature_importances_ : ndarray of shape (n_features,)
| The values of this array sum to 1, unless all trees are single node
| trees consisting of only the root node, in which case it will be an
| array of zeros.
|
| ----------------------------------------------------------------------
| Methods inherited from sklearn.ensemble._base.BaseEnsemble:
|
| __getitem__(self, index)
| Return the index'th estimator in the ensemble.
|
| __iter__(self)
| Return iterator over estimators in the ensemble.
|
| __len__(self)
| Return the number of estimators in the ensemble.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from sklearn.ensemble._base.BaseEnsemble:
|
| base_estimator_
| Estimator used to grow the ensemble.
|
| ----------------------------------------------------------------------
| Methods inherited from sklearn.base.BaseEstimator:
|
| __getstate__(self)
| Helper for pickle.
|
| __repr__(self, N_CHAR_MAX=700)
| Return repr(self).
|
| __setstate__(self, state)
|
| get_params(self, deep=True)
| Get parameters for this estimator.
|
| Parameters
| ----------
| deep : bool, default=True
| If True, will return the parameters for this estimator and
| contained subobjects that are estimators.
|
| Returns
| -------
| params : dict
| Parameter names mapped to their values.
|
| set_params(self, **params)
| Set the parameters of this estimator.
|
| The method works on simple estimators as well as on nested objects
| (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
| parameters of the form ``<component>__<parameter>`` so that it's
| possible to update each component of a nested object.
|
| Parameters
| ----------
| **params : dict
| Estimator parameters.
|
| Returns
| -------
| self : estimator instance
| Estimator instance.
# fit the model
rfc.fit(X_train_bc, y_train_bc)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
# train test scores
rfc.score(X_train_bc, y_train_bc), rfc.score(X_test_bc, y_test_bc)
(1.0, 0.9521276595744681)
Important Hyperparameter
n_estimators : integer, optional (default=100)
The number of trees in the forest.
# n_estimators=5: a small forest of only five trees
rfc5 = RandomForestClassifier(n_estimators=5)
# fit the model
rfc5.fit(X_train_bc, y_train_bc)
RandomForestClassifier(n_estimators=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=5)
# train test scores
rfc5.score(X_train_bc, y_train_bc), rfc5.score(X_test_bc, y_test_bc)
(1.0, 0.9414893617021277)
# feature importances in a bar graph
# (tree ensembles expose feature_importances_, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset_bc.feature_names, rfc5.feature_importances_)
# rotate the feature names so the long labels do not overlap
plt.xticks(rotation=90);
Random Forest Regressor#
# import the regressor and create an instance with default settings
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
# train the regressor on the training split
rfr.fit(X_train, y_train)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
# train and test scores, shown as a (train, test) tuple
rfr.score(X_train, y_train), rfr.score(X_test, y_test)
(0.9722250046056299, 0.8014232972663238)
# n_estimators=5: a regression forest with only 5 trees
rfr5 = RandomForestRegressor(n_estimators=5)
# train the smaller forest on the same training split
rfr5.fit(X_train, y_train)
RandomForestRegressor(n_estimators=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(n_estimators=5)
# train and test scores of the 5-tree regressor, shown as a (train, test) tuple
rfr5.score(X_train, y_train), rfr5.score(X_test, y_test)
(0.9491468901205052, 0.7602286543954643)
# feature importances of the 5-tree regressor in a bar graph
# (a random forest exposes feature_importances_, not linear coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, rfr5.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
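XGBoost#
Tree based
Gradient boosting: trees are built one after another, each correcting the errors of the previous ones (Random Forest instead averages independently trained trees)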
# import the XGBoost regressor and create an instance with default settings
from xgboost import XGBRegressor
xgb = XGBRegressor()
# fit on the same training split used for the random forest regressors
xgb.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
Logistic Regression#
# Load the breast cancer data once; return_X_y=True returns (data, target)
# directly, so the dataset is not loaded twice.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
X_bc, y_bc = load_breast_cancer(return_X_y=True)
# hold out 33% of the samples for testing; random_state fixes the split
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size=0.33, random_state=42)
from sklearn.linear_model import LogisticRegression
# NOTE(review): max_iter is set very high, presumably so the solver converges
# on these unscaled features - confirm.
log_reg = LogisticRegression(max_iter=10000)
# fit the classifier on the training split
log_reg.fit(X_train_bc, y_train_bc)
LogisticRegression(max_iter=10000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=10000)
# score on the training split
log_reg.score(X_train_bc, y_train_bc)
0.9606299212598425
# score on the held-out test split
log_reg.score(X_test_bc, y_test_bc)
0.9680851063829787
# predicted class labels for the first five test samples
log_reg.predict(X_test_bc[:5])
array([1, 0, 0, 1, 1])
# true labels of the same five test samples, for comparison with the predictions
y_test_bc[:5]
array([1, 0, 0, 1, 1])
# class probabilities for the first five test samples, rounded to 3 decimals;
# one row per sample, one column per class (column 0 = class 0, column 1 = class 1)
log_reg.predict_proba(X_test_bc[:5]).round(3)
array([[0.14 , 0.86 ],
[1. , 0. ],
[0.998, 0.002],
[0.002, 0.998],
[0. , 1. ]])